Computer Lab 1

Author

Edward Mckenzie

Load Libraries and Data, Initial Plot

rm(list=ls()) # Remove variables (NOTE(review): clearing the global workspace in a script is discouraged; kept as-is for the lab workflow)
cat("\014") # Clean workspace (sends form-feed to clear the RStudio console)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.3     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(splines)
library(glmnet)
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loaded glmnet 4.1-7
library(timetk)
library(caret)
Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift
library(tree)
library(rdist)

# Q1
# Load the hourly bike-rental data, build the log-count response and a
# normalized hour-of-day feature, split into train/test by date, then fit an
# order-p polynomial by OLS and report in-/out-of-sample RMSE.

bike_data <- read.csv('bike_rental_hourly.csv')

# Parse dates explicitly: read.csv leaves dteday as character, and comparing
# character against as.Date() objects only works by coincidence of the
# ISO yyyy-mm-dd format. Converting once makes the range filters robust.
bike_data$dteday <- as.Date(bike_data$dteday)

bike_data$log_cnt <- log(bike_data$cnt)
bike_data$hour <- bike_data$hr/23 # scale hour 0-23 onto [0, 1]

# Train on Feb-Mar 2011, test on Apr-May 2011.
bike_data_train <- bike_data[bike_data$dteday >= as.Date("2011-02-01") & bike_data$dteday <= as.Date("2011-03-31"), ]
bike_data_test <- bike_data[bike_data$dteday >= as.Date("2011-04-01") & bike_data$dteday <=  as.Date("2011-05-31"), ]

y_test <- bike_data_test$log_cnt
y_train <- bike_data_train$log_cnt

# Polynomial order. FIX: the training design previously hard-coded the degree
# as 2 alongside p <- 2; use p consistently so one edit changes the order.
p <- 2
X_train <- cbind(1, poly(bike_data_train$hour, p, raw = TRUE, simple = TRUE))
beta_hat <- solve(t(X_train)%*%X_train)%*%t(X_train)%*%y_train

# Predict in-sample and compute RMSE
y_hat_train <- X_train%*%beta_hat
RMSE_train <- sqrt(sum((y_train - y_hat_train)^2)/length(y_train))

# Predict out-of-sample and compute RMSE
X_test <- cbind(1, poly(bike_data_test$hour, p, raw = TRUE, simple = TRUE))
y_hat_test <- X_test%*%beta_hat
RMSE_test <- sqrt(sum((y_test - y_hat_test)^2)/length(y_test))

# Plot training data, test data, and fit on a fine grid.
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue", ylim = c(0, 8))
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p", col = "lightcoral")
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- cbind(1, poly(hours_grid, p, raw = TRUE, simple = TRUE))
y_hat_grid <- X_grid%*%beta_hat
lines(hours_grid, y_hat_grid, lty = 1, col = "lightcoral")
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1), col = c("cornflowerblue", "lightcoral",  "lightcoral"), legend=c("Train", "Test", "Fitted curve"))

Q1

1.1

# 1.1 -- Repeat the exercise with an 8th-order polynomial.

# Helper: raw polynomial design matrix with an explicit intercept column.
poly_design <- function(h, degree) {
  cbind(1, poly(h, degree, raw = TRUE, simple = TRUE))
}

y_test <- bike_data_test$log_cnt
y_train <- bike_data_train$log_cnt
p <- 8

# OLS via the normal equations.
X_train <- poly_design(bike_data_train$hour, p)
X_test <- poly_design(bike_data_test$hour, p)
beta_hat <- solve(t(X_train) %*% X_train) %*% t(X_train) %*% y_train

# In-sample predictions and RMSE.
y_hat_train <- X_train %*% beta_hat
RMSE_train <- sqrt(sum((y_train - y_hat_train)^2) / length(y_train))

# Out-of-sample predictions and RMSE.
y_hat_test <- X_test %*% beta_hat
RMSE_test <- sqrt(sum((y_test - y_hat_test)^2) / length(y_test))

# Overlay train, test, and the fitted curve on a fine hour grid.
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- poly_design(hours_grid, p)
y_hat_grid <- X_grid %*% beta_hat
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue", ylim = c(0, 8), main = "8th Order Polynomial Regression")
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p", col = "lightcoral")
lines(hours_grid, y_hat_grid, lty = 1, col = "lightcoral")
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1), col = c("cornflowerblue", "lightcoral",  "lightcoral"), legend=c("Train", "Test", "Fitted curve"))

1.2

# 1.2 -- Train/test RMSE as a function of polynomial order 1..10.

max_order <- 10
RMSE_train <- numeric(length = max_order)
RMSE_test <- numeric(length = max_order)

for (ord in seq_len(max_order)) {
  # Design matrices for this polynomial order.
  Xtr <- cbind(1, poly(bike_data_train$hour, ord, raw = TRUE, simple = TRUE))
  Xte <- cbind(1, poly(bike_data_test$hour, ord, raw = TRUE, simple = TRUE))

  # OLS coefficients via the normal equations.
  b <- solve(t(Xtr) %*% Xtr) %*% t(Xtr) %*% y_train

  # In-sample error.
  RMSE_train[ord] <- sqrt(sum((y_train - Xtr %*% b)^2) / length(y_train))
  # Out-of-sample error.
  RMSE_test[ord] <- sqrt(sum((y_test - Xte %*% b)^2) / length(y_test))
}

# Error curves against model complexity.
plot(seq_len(max_order), RMSE_train, type = "b", col = "cornflowerblue", xlab = "Polynomial Order", ylab = "RMSE",
     main = "RMSE vs Polynomial Order", ylim=c(0,2))
lines(seq_len(max_order), RMSE_test, type = "b", col = "lightcoral")
legend("topright", legend = c("Train RMSE", "Test RMSE"), col = c("cornflowerblue", "lightcoral"), lty = 1)

As we increase the order of the polynomial, both the training and test errors decline, with the test error consistently above the training error. The decrease in test error appears to plateau around order 8, so any higher order would add unnecessary complexity and risk overfitting the training data. An 8th-order polynomial regression therefore seems to be a good fit for this example.

1.3

# 1.3 -- Compare the order-8 polynomial with a nonparametric loess fit.

y_test <- bike_data_test$log_cnt
y_train <- bike_data_train$log_cnt
p <- 8
X_train <- cbind(1, poly(bike_data_train$hour, p, raw = TRUE, simple = TRUE))
beta_hat <- solve(t(X_train)%*%X_train)%*%t(X_train)%*%y_train

# Predict in-sample and compute RMSE
y_hat_train <- X_train%*%beta_hat
RMSE_train <- sqrt(sum((y_train - y_hat_train)^2)/length(y_train))

# Predict out-of-sample and compute RMSE
X_test <- cbind(1, poly(bike_data_test$hour, p, raw = TRUE, simple = TRUE))
y_hat_test <- X_test%*%beta_hat
RMSE_test <- sqrt(sum((y_test - y_hat_test)^2)/length(y_test))

# Nonparametric local regression (loess, default span and degree).
loess_model <- loess(log_cnt ~ hour, data = bike_data_train)
loess_test <- predict(loess_model, bike_data_test)
loess_RMSE <- sqrt(sum((y_test - loess_test)^2)/length(y_test))

# FIX: `se = TRUE` was previously written INSIDE data.frame(), so it became a
# (ignored) column of newdata rather than an argument to predict(). Standard
# errors are never used below, so the stray argument is dropped; predictions
# are unchanged.
local_plot <- predict(loess_model, newdata = data.frame(hour = seq(0, 1, length.out = 1000)))

# Plot training data, test data, and fit on a fine grid.
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue", ylim = c(0, 8))
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p", col = "lightcoral")
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- cbind(1, poly(hours_grid, p, raw = TRUE, simple = TRUE))
y_hat_grid <- X_grid%*%beta_hat
lines(hours_grid, y_hat_grid, lty = 1, col = "lightcoral")
# Overlay the loess curve for visual comparison with the polynomial fit.
lines(hours_grid, local_plot, lty = 1, col = "green")
legend(x = "topleft", pch = c(1, 1, NA, NA), lty = c(NA, NA, 1, 1), col = c("cornflowerblue", "lightcoral",  "lightcoral", "green"), legend=c("Train", "Test", "Poly Fitted curve", "Smoothed Fit"))

cat("The RMSE for Polynomial Regression is ", RMSE_test)
The RMSE for Polynomial Regression is  1.021448
cat("\nThe RMSE for Loess Fit is ", loess_RMSE)

The RMSE for Loess Fit is  1.120946

When comparing results on the test data for each model we can see the RMSE for the Polynomial regression comes out to be: 1.021448 compared to 1.120946 for the locally fitted method. Therefore with the standard settings the polynomial regression outperforms the locally fitted model on the test data.

Q2

# Q2 -- Natural cubic spline basis with 25 interior knots, fit by OLS.

set.seed(123)

suppressMessages(library(splines))

# Interior knots evenly spaced over (most of) the unit hour interval.
knots <- seq(0.05, 0.95, length.out = 25)
X_train <- ns(bike_data_train$hour, knots = knots, intercept = TRUE)
X_test <- ns(bike_data_test$hour, knots = knots, intercept = TRUE)

# Least-squares coefficients via the normal equations.
beta_hat <- solve(t(X_train) %*% X_train) %*% t(X_train) %*% y_train

# Visualise the spline fit over a fine grid together with both samples.
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- ns(hours_grid, knots = knots, intercept = TRUE)
y_hat_spline_grid <- X_grid %*% beta_hat
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue", ylim = c(0, 8))
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p", col = "lightcoral")
lines(hours_grid, y_hat_spline_grid, lty = 1, col = "lightcoral")
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1), col = c("cornflowerblue", "lightcoral",  "lightcoral"), legend=c("Train", "Test", "Spline"))

2.1

# 2.1 -- Ridge-penalised spline fit with lambda chosen by direct optimisation
# of the test-set RMSE over [0, 1].

I_mat <- diag(ncol(X_train))

# Closed-form ridge solution for a given penalty lambda.
ridge_coef <- function(lambda) {
  solve(t(X_train) %*% X_train + lambda * I_mat) %*% t(X_train) %*% y_train
}

# Objective: test-set RMSE as a function of lambda.
objective_func <- function(lambda) {
  sqrt(mean((y_test - X_test %*% ridge_coef(lambda))^2))
}

optimal_lam <- optim(par = 0.5, fn = objective_func, method = "L-BFGS-B", lower = 0, upper = 1)

beta_hat_new <- ridge_coef(optimal_lam$par)

# Plot the penalised spline fit on a fine grid.
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- ns(hours_grid, knots = knots, intercept = TRUE)
y_hat_spline_grid <- X_grid %*% beta_hat_new
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue", ylim = c(0, 8), main = "Spline Fit with Optimal Lambda")
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p", col = "lightcoral")
lines(hours_grid, y_hat_spline_grid, lty = 1, col = "lightcoral")
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1), col = c("cornflowerblue", "lightcoral",  "lightcoral"), legend=c("Train", "Test", "Spline"))

2.2

# Q2.2 -- Ridge regression (alpha = 0) on the spline basis via glmnet, with
# the penalty chosen by cross-validation using the 1-SE rule.

ridge_model <- glmnet(X_train, y_train, alpha = 0)

cv <- cv.glmnet(X_train, y_train, alpha = 0)

# Largest lambda within one standard error of the CV minimum (simpler model).
optimal_lambda_ridge <- cv$lambda.1se

ridge_train <- predict(ridge_model, s = optimal_lambda_ridge, newx = X_train)
ridge_test <- predict(ridge_model, s = optimal_lambda_ridge, newx = X_test)

ridge_RMSE_train <- sqrt(sum((y_train - ridge_train)^2)/length(y_train))
ridge_RMSE_predict <- sqrt(sum((y_test - ridge_test)^2)/length(y_test))

plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue", ylim = c(0, 8), main = "Ridge Regression with 1 SE Optimal Lambda")
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p", col = "lightcoral")
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- ns(hours_grid, knots = knots, intercept = TRUE)
# Evaluate the ridge fit on the fine grid. (A dead reassignment of
# y_hat_spline_grid -- computed but never plotted -- has been removed.)
ridge_train_plot <- predict(ridge_model, s = optimal_lambda_ridge, newx = X_grid)
lines(hours_grid, ridge_train_plot, type = "l", col = "green")
# FIX: pch/lty vectors previously had length 4 for only three legend entries.
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1), col = c("cornflowerblue", "lightcoral", "green"), legend=c("Train", "Test" ,"Ridge Fit"))

cat("The RMSE for the training set is: ", ridge_RMSE_train)
The RMSE for the training set is:  0.7111962
cat("\nThe RMSE for test set is ", ridge_RMSE_predict)

The RMSE for test set is  1.000773

2.3

# Q2.3 -- Refit the ridge predictions using the CV-minimising lambda instead
# of the 1-SE choice, and compare errors.

min_optimal_lambda <- cv$lambda.min

# Predictions at lambda.min for both samples.
ridge_train_min <- predict(ridge_model, s = min_optimal_lambda, newx = X_train)
ridge_test_min <- predict(ridge_model, s = min_optimal_lambda, newx = X_test)

# Root-mean-square errors under the less-regularised penalty.
rmse_min <- function(obs, fit) sqrt(sum((obs - fit)^2) / length(obs))
ridge_RMSE_train_min <- rmse_min(y_train, ridge_train_min)
ridge_RMSE_predict_min <- rmse_min(y_test, ridge_test_min)

cat("The RMSE for the training set is: ", ridge_RMSE_train_min)
The RMSE for the training set is:  0.6919639
cat("\nThe RMSE for test set is ", ridge_RMSE_predict_min)

The RMSE for test set is  1.002668

Previous results:

cat("The RMSE for the training set is: ", ridge_RMSE_train)
The RMSE for the training set is:  0.7111962
cat("\nThe RMSE for test set is ", ridge_RMSE_predict)

The RMSE for test set is  1.000773

When comparing the performance of the two models, the minimum-lambda model has the lower training error, but the 1-SE-lambda model has a slightly lower test error, meaning the simpler model generalized better to the test data.

2.4

# 2.4 -- Lasso (alpha = 1) on the same spline basis, lambda by the 1-SE rule.

lasso_model <- glmnet(X_train, y_train, alpha = 1)

cv_lasso <- cv.glmnet(X_train, y_train, alpha = 1)

optimal_lambda_lasso <- cv_lasso$lambda.1se

# Predictions for both samples at the selected penalty.
lasso_train <- predict(lasso_model, s = optimal_lambda_lasso, newx = X_train)
lasso_test <- predict(lasso_model, s = optimal_lambda_lasso, newx = X_test)

# Root-mean-square errors.
rmse_lasso <- function(obs, fit) sqrt(sum((obs - fit)^2) / length(obs))
lasso_RMSE_train <- rmse_lasso(y_train, lasso_train)
lasso_RMSE_predict <- rmse_lasso(y_test, lasso_test)

cat("The Lasso Model RMSE for the training set is: ", lasso_RMSE_train)
The Lasso Model RMSE for the training set is:  0.7187198
cat("\nThe Lasso Model RMSE for test set is ", lasso_RMSE_predict)

The Lasso Model RMSE for test set is  1.006538

Both the Lasso and Ridge models produce comparable performance on the training and test sets, with the Ridge regression model using the 1-standard-error lambda being the best performing model in terms of out-of-sample performance.

Q3

# One hot for weathersit (drop level 1 as the reference category)
one_hot_encode_weathersit <- model.matrix(~ as.factor(weathersit) - 1,data = bike_data)
one_hot_encode_weathersit  <- one_hot_encode_weathersit[, -1] # Remove reference category
# NOTE(review): these labels assume weathersit levels 2-4 map to exactly these
# conditions in this order -- confirm against the dataset codebook.
colnames(one_hot_encode_weathersit) <- c('cloudy', 'light rain', 'heavy rain')
bike_data <- cbind(bike_data, one_hot_encode_weathersit)

# One hot for weekday (level 0 = Sunday is the dropped reference)
one_hot_encode_weekday <- model.matrix(~ as.factor(weekday) - 1,data = bike_data)
one_hot_encode_weekday  <- one_hot_encode_weekday[, -1] # Remove reference category
colnames(one_hot_encode_weekday) <- c('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
bike_data <- cbind(bike_data, one_hot_encode_weekday)

# One hot for season (comment previously mislabelled this block "weekday")
one_hot_encode_season <- model.matrix(~ as.factor(season) - 1,data = bike_data)
one_hot_encode_season  <- one_hot_encode_season[, -1] # Remove reference category
colnames(one_hot_encode_season) <- c('Spring', 'Summer', 'Fall')
bike_data <- cbind(bike_data, one_hot_encode_season)

head(bike_data)
  instant     dteday season yr mnth hr holiday weekday workingday weathersit
1       1 2011-01-01      1  0    1  0       0       6          0          1
2       2 2011-01-01      1  0    1  1       0       6          0          1
3       3 2011-01-01      1  0    1  2       0       6          0          1
4       4 2011-01-01      1  0    1  3       0       6          0          1
5       5 2011-01-01      1  0    1  4       0       6          0          1
6       6 2011-01-01      1  0    1  5       0       6          0          2
  temp  atemp  hum windspeed casual registered cnt  log_cnt       hour cloudy
1 0.24 0.2879 0.81    0.0000      3         13  16 2.772589 0.00000000      0
2 0.22 0.2727 0.80    0.0000      8         32  40 3.688879 0.04347826      0
3 0.22 0.2727 0.80    0.0000      5         27  32 3.465736 0.08695652      0
4 0.24 0.2879 0.75    0.0000      3         10  13 2.564949 0.13043478      0
5 0.24 0.2879 0.75    0.0000      0          1   1 0.000000 0.17391304      0
6 0.24 0.2576 0.75    0.0896      0          1   1 0.000000 0.21739130      1
  light rain heavy rain Mon Tue Wed Thu Fri Sat Spring Summer Fall
1          0          0   0   0   0   0   0   1      0      0    0
2          0          0   0   0   0   0   0   1      0      0    0
3          0          0   0   0   0   0   0   1      0      0    0
4          0          0   0   0   0   0   0   1      0      0    0
5          0          0   0   0   0   0   0   1      0      0    0
6          0          0   0   0   0   0   0   1      0      0    0

3.1

# Remove the raw categorical columns now that one-hot versions exist
# (keeps the one-hot columns added in Q3).
design_data <- select(bike_data, -weathersit, -weekday, -season)
  
head(design_data)
  instant     dteday yr mnth hr holiday workingday temp  atemp  hum windspeed
1       1 2011-01-01  0    1  0       0          0 0.24 0.2879 0.81    0.0000
2       2 2011-01-01  0    1  1       0          0 0.22 0.2727 0.80    0.0000
3       3 2011-01-01  0    1  2       0          0 0.22 0.2727 0.80    0.0000
4       4 2011-01-01  0    1  3       0          0 0.24 0.2879 0.75    0.0000
5       5 2011-01-01  0    1  4       0          0 0.24 0.2879 0.75    0.0000
6       6 2011-01-01  0    1  5       0          0 0.24 0.2576 0.75    0.0896
  casual registered cnt  log_cnt       hour cloudy light rain heavy rain Mon
1      3         13  16 2.772589 0.00000000      0          0          0   0
2      8         32  40 3.688879 0.04347826      0          0          0   0
3      5         27  32 3.465736 0.08695652      0          0          0   0
4      3         10  13 2.564949 0.13043478      0          0          0   0
5      0          1   1 0.000000 0.17391304      0          0          0   0
6      0          1   1 0.000000 0.21739130      1          0          0   0
  Tue Wed Thu Fri Sat Spring Summer Fall
1   0   0   0   0   1      0      0    0
2   0   0   0   0   1      0      0    0
3   0   0   0   0   1      0      0    0
4   0   0   0   0   1      0      0    0
5   0   0   0   0   1      0      0    0
6   0   0   0   0   1      0      0    0
# Filter for relevant dates: train Jan 2011 - May 2012, test Jun - Dec 2012.
design_data_train <- design_data[design_data$dteday >= as.Date("2011-01-01") & design_data$dteday <=  as.Date("2012-05-31"), ]
design_data_test <- design_data[design_data$dteday >= as.Date("2012-06-01") & design_data$dteday <=  as.Date("2012-12-31"), ]

# Insert a natural-spline basis for the hour-of-day effect.
spline_basis <- ns(design_data_train$hour, df = 10, intercept = FALSE)
# FIX: reuse the training knots AND boundary knots for the test basis.
# Previously only `knots` was passed (alongside a `df` that ns() ignores when
# knots are given), so ns() derived boundary knots from the TEST data -- a
# different basis whenever the test hour range differs from training.
knots <- attr(spline_basis, "knots")
boundary_knots <- attr(spline_basis, "Boundary.knots")
spline_basis_test <- ns(design_data_test$hour, knots = knots, Boundary.knots = boundary_knots, intercept = FALSE)

# Store the basis matrices in place of the scalar hour column.
design_data_train$hour <- spline_basis
design_data_test$hour <- spline_basis_test

# Create Feature Filter
features <- c("hour", "yr", "holiday", "workingday", "temp", "atemp", "hum", "windspeed","cloudy","light rain","heavy rain","Mon","Tue","Wed","Thu","Fri","Sat","Spring","Summer","Fall")

design_matrix_train <- design_data_train[features]
design_matrix_test <- design_data_test[features]

X_train <- model.matrix(~ ., data = design_matrix_train)
y_train <- design_data_train$log_cnt

# Lasso over the expanded feature set; lambda chosen by CV with the 1-SE rule.
lasso_model <- glmnet(X_train, y_train, alpha = 1)

cv_lasso <- cv.glmnet(X_train, y_train, alpha = 1)

optimal_lambda_lasso <- cv_lasso$lambda.1se

X_test <- model.matrix(~ ., data = design_matrix_test)
y_test <- design_data_test$log_cnt

lasso_train <- predict(lasso_model, s = optimal_lambda_lasso, newx = X_train)
lasso_test <- predict(lasso_model, s = optimal_lambda_lasso, newx = X_test)

lasso_RMSE_train <- sqrt(sum((y_train - lasso_train)^2)/length(y_train))
lasso_RMSE_predict <- sqrt(sum((y_test - lasso_test)^2)/length(y_test))

cat("The Lasso Model RMSE for the training set with extra features is: ", lasso_RMSE_train)
The Lasso Model RMSE for the training set with extra features is:  0.6509982
cat("\nThe Lasso Model RMSE for test set with extra features is ", lasso_RMSE_predict)

The Lasso Model RMSE for test set with extra features is  0.6176499

3.2

# 3.2 -- Coefficient paths against log(lambda); label = TRUE tags each curve
# with its column index in X_train.
plot(lasso_model, xvar = "lambda", main = "Lasso penalty\n\n", label = TRUE, ylim= c(-1,1.5))

# Coefficients at a deliberately large penalty (s = 0.5): the few non-zero
# entries are the features that survive longest under the lasso penalty.
coef(lasso_model, s = 0.5)
31 x 1 sparse Matrix of class "dgCMatrix"
                       s1
(Intercept)   4.716084663
(Intercept)   .          
hour1        -2.446887787
hour2         .          
hour3         .          
hour4         .          
hour5         .          
hour6         .          
hour7         .          
hour8         .          
hour9        -1.062296132
hour10        .          
yr            .          
holiday       .          
workingday    .          
temp          .          
atemp         0.009763773
hum           .          
windspeed     .          
cloudy        .          
`light rain`  .          
`heavy rain`  .          
Mon           .          
Tue           .          
Wed           .          
Thu           .          
Fri           .          
Sat           .          
Spring        .          
Summer        .          
Fall          .          

By plotting the coefficient values against lambda we can see that there are three features that shrink to 0 last, looking at the right side of the plot. We can access those coefficients via the coef() function used above with a large enough lambda as the input; the resulting table tells us that hour1 (feature 1), hour9 (feature 9), and atemp (feature 15) are the most important features for the fitted Lasso model.

3.3

# Q3.3 -- Autocorrelation diagnostics of the lasso residuals. Significant
# ACF spikes indicate the i.i.d.-error assumption is violated.

residuals_train <- y_train - lasso_train
acf_residuals_train <- acf(residuals_train)

residuals_test <- y_test - lasso_test
acf_residuals_test <- acf(residuals_test)

When plotting the autocorrelation function of both the training and test residuals we can see evidence of significant positive autocorrelation up to around lag 4. This is evidence of the independently and identically distributed errors condition being violated.

Q4

4.1

# 4.1 -- Fitted values for the final week of the test period (Dec 25-31, 2012).
response <- design_data_test[design_data_test$dteday >= as.Date("2012-12-25") & design_data_test$dteday <= as.Date("2012-12-31"),]
response <- response$hr

# Align predictions positionally: assumes lasso_test rows follow the same
# chronological order as design_data_test -- TODO confirm.
corresponding_values <- tail(lasso_test, length(response))

# NOTE(review): the x-axis here is the hour of day (hr), not actual counts,
# despite the plot title "Actual vs. Fitted Values".
plot(response, corresponding_values,  xlab = "Hour", ylab = "Fitted Values", main = "Actual vs. Fitted Values", col = "cornflowerblue", pch = 16)

4.2

# 4.2 -- Add lagged copies of the response (lags 1-4 and 24) as predictors.
lagged_training <- design_data_train %>%
  tk_augment_lags(contains("log_cnt"), .lags = 1:4)

lag_24 <- design_data_train %>%
  tk_augment_lags(contains("log_cnt"), .lags = 24)

lagged_training <- cbind(lagged_training, lag_24["log_cnt_lag24"])

# Same lag construction for the test period (test lags come from test data).
lagged_test <- design_data_test %>%
  tk_augment_lags(contains("log_cnt"), .lags = 1:4)

lag_24_test <- design_data_test %>%
  tk_augment_lags(contains("log_cnt"), .lags = 24)

lagged_test <- cbind(lagged_test, lag_24_test["log_cnt_lag24"])

features <- c("hour", "yr", "holiday", "workingday", "temp", "atemp", "hum", "windspeed","cloudy","light rain","heavy rain","Mon","Tue","Wed","Thu","Fri","Sat","Spring","Summer","Fall", "log_cnt_lag1", "log_cnt_lag2", "log_cnt_lag3", "log_cnt_lag4", "log_cnt_lag24")

lagged_train_matrix <- lagged_training[features]
lagged_test_matrix <- lagged_test[features]

# Drop rows made incomplete by lagging. NOTE(review): the y[25:...] alignment
# below assumes na.omit() removes EXACTLY the first 24 rows (i.e. the only
# NAs are those introduced by the lags) -- TODO confirm no other NAs exist.
lagged_train_matrix <- na.omit(lagged_train_matrix)
lagged_test_matrix <- na.omit(lagged_test_matrix)

X_train <- model.matrix(~ ., data = lagged_train_matrix)
y_train <- design_data_train$log_cnt
y_train_lagged <- y_train[25:length(y_train)]

# Lasso on the lag-augmented design, lambda by the 1-SE rule.
lagged_lasso_model <- glmnet(X_train, y_train_lagged, alpha = 1)

cv_lagged_lasso <- cv.glmnet(X_train, y_train_lagged, alpha = 1)

optimal_lagged_lasso <- cv_lagged_lasso$lambda.1se

X_test <- model.matrix(~ ., data = lagged_test_matrix)
y_test <- design_data_test$log_cnt
y_test_lagged <- y_test[25:length(y_test)]

lagged_lasso_train <- predict(lagged_lasso_model, s = optimal_lagged_lasso, newx = X_train)
lagged_lasso_test <- predict(lagged_lasso_model, s = optimal_lagged_lasso, newx = X_test)

lagged_lasso_RMSE_train <- sqrt(sum((y_train_lagged - lagged_lasso_train)^2)/length(y_train_lagged))
lagged_lasso_RMSE_predict <- sqrt(sum((y_test_lagged - lagged_lasso_test)^2)/length(y_test_lagged))

cat("The Lasso Model RMSE for the training set with lags is: ", lagged_lasso_RMSE_train)
The Lasso Model RMSE for the training set with lags is:  0.4242765
cat("\nThe Lasso Model RMSE for test set with extra lags is ", lagged_lasso_RMSE_predict)

The Lasso Model RMSE for test set with extra lags is  0.3615772
# Residual ACF diagnostics for the lagged model (compare against Q3.3).
lagged_residuals_train <- y_train_lagged - lagged_lasso_train
lagged_acf_residuals_train <- acf(lagged_residuals_train)

lagged_residuals_test <- y_test_lagged - lagged_lasso_test
lagged_acf_residuals_test <- acf(lagged_residuals_test)

Comparing the RMSE values and ACF plots for the lagged lasso model, the addition of lags has led to a significant improvement in performance on both the training and test data, and has also reduced the autocorrelation between residuals, yielding a much more adequate model.

4.3

# 4.3 -- Overlay the lagged-model fit for the final test week on the 4.1 plot.
# Positional alignment via tail(), as in 4.1.
lagged_values <- tail(lagged_lasso_test, length(response))

# NOTE(review): x-axis is hour of day (response = hr), not actual counts,
# despite the axis label.
plot(response, corresponding_values,  xlab = "Actual Counts", ylab = "Fitted Values", 
     main = "Actual vs. Fitted Values", col = "cornflowerblue", pch = 16)
points(response, lagged_values, col = "lightcoral", pch = 16)
legend(x = "bottomright", pch = c(16, 16), col = c("cornflowerblue", "lightcoral"), legend=c("Original Fit", "Lagged Fit"))

Q5

5.1

# Q5.1 -- Regression tree on the same lag-augmented design matrices.

df <- as.data.frame(X_train)
df_test <- as.data.frame(X_test)

# Rename columns to generic identifiers (presumably because model.matrix()
# produces names with spaces/backticks such as `light rain` -- confirm).
# FIX: derive the count from the data instead of hard-coding 34, so this
# survives any change to the feature set; train and test must share the
# same layout for predict() to line up.
stopifnot(ncol(df) == ncol(df_test))
var_names <- c("Intercept", paste0("Var", seq_len(ncol(df) - 1)))
train_tree_df <- setNames(df, var_names)
test_tree_df <- setNames(df_test, var_names)

tree_model <- tree(y_train_lagged ~ ., train_tree_df)

tree_predict <- predict(tree_model, newdata = test_tree_df)

tree_RMSE <- sqrt(sum((y_test_lagged - tree_predict)^2)/length(y_test_lagged))

cat("\nThe Tree Model RMSE for test set with extra lags is ", tree_RMSE)

The Tree Model RMSE for test set with extra lags is  0.5991466

5.2

# 5.2 -- Draw the fitted tree; pretty = 0 prints split labels verbatim.
plot(tree_model)
text(tree_model, pretty = 0)

5.3

# 5.3 -- Compare all three fits for the final test week on one plot.
tree_values <- tail(tree_predict, length(response)) # positional alignment, as in 4.1

# NOTE(review): x-axis is hour of day (response = hr), not actual counts,
# despite the axis label.
plot(response, corresponding_values,  xlab = "Actual Counts", ylab = "Fitted Values", 
     main = "Actual vs. Fitted Values", col = "cornflowerblue", pch = 16)
points(response, lagged_values, col = "lightcoral", pch = 16)
points(response, tree_values, col = "green", pch = 16)
legend(x = "bottomright", pch = c(16, 16, 16), col = c("cornflowerblue", "lightcoral", "green"), legend=c("Original Fit", "Lagged Fit", "Tree Fit"))

Q6

# Q6 -- Spam classification. Start from a clean workspace, load the email
# data, standardise the predictors, and relabel the response as a factor.
rm(list=ls()) # Remove variables 
cat("\014") # Clean workspace
# Q6

load(file = 'spam_ham_emails.RData')

# Standardise every predictor column (all columns except the label).
Spam_ham_emails[, -1] <- scale(Spam_ham_emails[, -1])

# Recode the 0/1 label as a two-level factor: 1 -> TRUE -> "spam".
Spam_ham_emails$spam <- as.factor(Spam_ham_emails$spam == 1)
levels(Spam_ham_emails$spam) <- c("not spam", "spam")

head(Spam_ham_emails)
      spam       over     remove   internet        free        hpl          X.
1 not spam -0.3502281 -0.2917622 -0.2625330 -0.30134485  4.2669752 -0.32987644
2     spam  1.6583607 -0.2917622  0.4106637  1.03071019 -0.2992074  0.38364598
3 not spam -0.3502281 -0.2917622 -0.2625330 -0.30134485  2.6659926 -0.32987644
4     spam  0.6358064 -0.2917622 -0.2625330 -0.30134485 -0.2992074 -0.28451505
5     spam  0.3436480  0.1936234 -0.2625330 -0.07126262 -0.2992074 -0.06996793
6     spam -0.3502281  0.9855684  0.9841276 -0.30134485 -0.2992074  0.17522878
        X..1   CapRunMax CapRunTotal        our         hp   george      X1999
1 -0.3083214 -0.17021174  0.02096274 -0.4642639  3.9791176 -0.22787  4.9900587
2  0.8751730 -0.08811470  0.01271665 -0.4642639 -0.3287789 -0.22787 -0.3234205
3 -0.3083214 -0.22665345 -0.40618481 -0.4642639 -0.3287789 -0.22787  2.7702052
4 -0.3083214 -0.09837683 -0.12416847 -0.4642639 -0.3287789 -0.22787 -0.3234205
5  0.9239769  0.40959862  0.75651413 -0.1817414 -0.3287789 -0.22787 -0.3234205
6  1.3672790 -0.17021174 -0.05655052  0.6658261 -0.3287789 -0.22787 -0.3234205
           re       edu
1  0.59185916 -0.197366
2 -0.03086294 -0.197366
3 -0.29774385 -0.197366
4 -0.29774385 -0.197366
5 -0.29774385 -0.197366
6 -0.29774385 -0.197366
print(paste("Percentage of spam:", 100*mean(Spam_ham_emails$spam == "spam")))
[1] "Percentage of spam: 39.4044772875462"
set.seed(1234)
suppressMessages(library(caret))

# Stratified 75/25 train/test split on the spam label.
train_obs <- createDataPartition(y = Spam_ham_emails$spam, p = .75, list = FALSE)
train <- Spam_ham_emails[train_obs, ]
test <- Spam_ham_emails[-train_obs, ]
# Confirm both training and test are balanced with respect to spam emails
print(paste("Percentage of training data consisting of spam emails:", 
            100*mean(train$spam == "spam")))
[1] "Percentage of training data consisting of spam emails: 39.4088669950739"
print(paste("Percentage of test data consisting of spam emails:", 
            100*mean(test$spam == "spam")))
[1] "Percentage of test data consisting of spam emails: 39.3913043478261"
# Logistic regression on all predictors. The warning below indicates some
# fitted probabilities reached 0 or 1 ((near-)separation on some points).
glm_fit <- glm(spam ~ ., family = binomial, data = train)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
y_prob_hat_test <- predict(glm_fit, newdata = test, type = "response")
threshold <- 0.5 # Predict spam if probability > threshold
# Convert probabilities to class labels; factor levels order FALSE < TRUE,
# so the relabelling below maps FALSE -> "not spam", TRUE -> "spam".
y_hat_test <- as.factor(y_prob_hat_test > threshold)
levels(y_hat_test) <- c("not spam", "spam")
confusionMatrix(data = y_hat_test, test$spam, positive = "spam")
Confusion Matrix and Statistics

          Reference
Prediction not spam spam
  not spam      658   65
  spam           39  388
                                          
               Accuracy : 0.9096          
                 95% CI : (0.8915, 0.9255)
    No Information Rate : 0.6061          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.8087          
                                          
 Mcnemar's Test P-Value : 0.01423         
                                          
            Sensitivity : 0.8565          
            Specificity : 0.9440          
         Pos Pred Value : 0.9087          
         Neg Pred Value : 0.9101          
             Prevalence : 0.3939          
         Detection Rate : 0.3374          
   Detection Prevalence : 0.3713          
      Balanced Accuracy : 0.9003          
                                          
       'Positive' Class : spam            
                                          

6.1

# 6.1 -- Confusion matrix: rows = predicted class, columns = actual class.
confusion_matrix <- table(y_hat_test, test$spam)
print(confusion_matrix)
          
y_hat_test not spam spam
  not spam      658   65
  spam           39  388

6.2

# 6.2 -- Classification metrics computed from the confusion matrix.
# Cell layout: rows = predicted, columns = actual ("spam" is the positive class).
TN <- confusion_matrix[1,1]
FP <- confusion_matrix[2,1]
FN <- confusion_matrix[1,2]
TP <- confusion_matrix[2,2]

# Fraction of all test emails classified correctly.
classification_accuracy <- (TP + TN)/nrow(test)

# Of emails predicted spam, how many actually are.
precision <- TP / (TP + FP)

# Of actual spam emails, how many were caught.
recall <- TP / (TP + FN)

# Of actual non-spam emails, how many were correctly passed through.
specificity <- TN / (TN + FP)

cat("\nThe Accuracy is:", classification_accuracy)

The Accuracy is: 0.9095652
cat("\nThe Precision is ", precision)

The Precision is  0.9086651
cat("\nThe Sensitivity/Recall is ", recall)

The Sensitivity/Recall is  0.8565121
cat("\nThe Specificity is ", specificity)

The Specificity is  0.9440459

6.3

library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'
The following objects are masked from 'package:stats':

    cov, smooth, var
# NOTE(review): `roc` shadows the pROC function of the same name, and `=` is
# used for assignment against house style -- consider `roc_obj <- roc(...)`.
roc = roc(test$spam, y_prob_hat_test)
Setting levels: control = not spam, case = spam
Setting direction: controls < cases
# ROC curve with AUC annotation and the best threshold marked.
plot(roc, print.auc = TRUE, auc.polygon = TRUE, auc.polygon.col = "cornflowerblue", print.thres = "best")

The ROC curve plots the true positive rate (sensitivity) of the classifier against the false positive rate (1 - specificity) at different classification thresholds. The Area Under the Curve (AUC) of the ROC plot can be used as a criterion to measure the usefulness of the classifier's predictions: the closer the curve reaches the upper-left corner, the better the performance. A useless model would have an ROC curve lying flat on the 45-degree line. In our case the AUC is 0.965, meaning there is only a small overlap between the score distributions of the two classes and that our classifier has a high level of separability/performance.

Q7

7.1

# Q7.1 -- Classification tree for spam, evaluated on the held-out test set.

tree_classifier <- tree(spam ~ ., data = train)

tree_classifier_test <- predict(tree_classifier, newdata = test, type = "class")

# FIX: compute the confusion-matrix summary once and print the stored result,
# instead of calling confusionMatrix() twice on identical inputs.
tree_result <- confusionMatrix(data = tree_classifier_test, test$spam, positive = "spam")

tree_result
Confusion Matrix and Statistics

          Reference
Prediction not spam spam
  not spam      652   66
  spam           45  387
                                          
               Accuracy : 0.9035          
                 95% CI : (0.8849, 0.9199)
    No Information Rate : 0.6061          
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.7962          
                                          
 Mcnemar's Test P-Value : 0.05765         
                                          
            Sensitivity : 0.8543          
            Specificity : 0.9354          
         Pos Pred Value : 0.8958          
         Neg Pred Value : 0.9081          
             Prevalence : 0.3939          
         Detection Rate : 0.3365          
   Detection Prevalence : 0.3757          
      Balanced Accuracy : 0.8949          
                                          
       'Positive' Class : spam            
                                          

7.2

# 7.2 -- Visualise the classification tree; pretty = 0 keeps split labels as-is.
plot(tree_classifier)
text(tree_classifier, pretty = 0)

Q8

8.1

# 8.1 -- Recode labels back to 0/1 and min-max normalise predictor columns.
train$spam <- ifelse(train$spam == "spam", 1, 0)
test$spam <- ifelse(test$spam == "spam", 1, 0)

# NOTE(review): normalize_vec is not defined anywhere in this document as
# shown -- presumably a course-provided helper that rescales each column and
# prints its min/max (see output below); confirm it is defined before this
# chunk runs.
normalized_train <- as.data.frame(apply(train[2:16], 2, normalize_vec))
Normalization Parameters
min: -0.350228110166595
max: 21.1234126863928
Normalization Parameters
min: -0.291762174668414
max: 18.2806239868079
Normalization Parameters
min: -0.262533023843959
max: 27.4382667500574
Normalization Parameters
min: -0.30134485023877
max: 12.00199991105
Normalization Parameters
min: -0.2992074132057
max: 18.4841509923503
Normalization Parameters
min: -0.329876440355473
max: 39.4876172202886
Normalization Parameters
min: -0.308321429912137
max: 21.2467324392595
Normalization Parameters
min: -0.262570910157928
max: 50.9865074826567
Normalization Parameters
min: -0.465556667844373
max: 16.1272291200046
Normalization Parameters
min: -0.464263942558744
max: 13.0522080341277
Normalization Parameters
min: -0.328778934272805
max: 12.1342049991491
Normalization Parameters
min: -0.227870044749662
max: 9.67029202594554
Normalization Parameters
min: -0.323420460683335
max: 15.9476333828254
Normalization Parameters
min: -0.297743847022006
max: 20.8748077306014
Normalization Parameters
min: -0.197366028859547
max: 24.0036422602496
# Min-max scale the test features.
# NOTE(review): the printed min/max values differ between train and test,
# so normalize_vec appears to rescale each dataset with its OWN range —
# the test set is therefore not on the training scale. Confirm whether
# normalize_vec should instead reuse the training min/max.
normalized_test <- as.data.frame(apply(test[2:16], 2, normalize_vec))
Normalization Parameters
min: -0.350228110166595
max: 12.6873395163159
Normalization Parameters
min: -0.291762174668414
max: 18.2806239868079
Normalization Parameters
min: -0.262533023843959
max: 14.398196289436
Normalization Parameters
min: -0.30134485023877
max: 23.9178377507234
Normalization Parameters
min: -0.2992074132057
max: 9.94933567842119
Normalization Parameters
min: -0.329876440355473
max: 11.4089160284131
Normalization Parameters
min: -0.308321429912137
max: 24.1058273203968
Normalization Parameters
min: -0.262570910157928
max: 10.2099329945011
Normalization Parameters
min: -0.465556667844373
max: 25.6580619465605
Normalization Parameters
min: -0.464263942558744
max: 14.4053421924099
Normalization Parameters
min: -0.328778934272805
max: 11.6376002624837
Normalization Parameters
min: -0.227870044749662
max: 9.67029202594554
Normalization Parameters
min: -0.323420460683335
max: 10.397999633588
Normalization Parameters
min: -0.297743847022006
max: 10.6839110562178
Normalization Parameters
min: -0.197366028859547
max: 16.6500479048287
# Pull the 0/1 outcome vectors used by the KNN classifier below.
y_train <- train[["spam"]]
y_test <- test[["spam"]]

knn_manual <- function(X_train, X_test, y_train, k) {
  # Manual k-nearest-neighbours classifier using majority vote.
  #
  # X_train, X_test: numeric feature tables, one observation per row,
  #   with matching columns.
  # y_train: class labels for the rows of X_train (values coercible to
  #   numeric, e.g. 0/1).
  # k: number of neighbours to vote over.
  # Returns a numeric vector of predicted labels, one per row of X_test
  #   (length 0 when X_test has no rows).

  # Pairwise Euclidean distances: rows index training points, columns
  # index test points.
  distances <- cdist(X_train, X_test, metric = "euclidean")

  n_test <- ncol(distances)
  # Preallocate the result instead of growing it with c() inside the
  # loop (the original appended, which is O(n^2) and failed for n = 0).
  y_pred <- numeric(n_test)

  for (i in seq_len(n_test)) {
    # Indices of the k training points closest to test point i.
    k_nearest_indices <- order(distances[, i])[1:k]
    k_nearest_labels <- y_train[k_nearest_indices]

    # Majority vote; on a tie which.max() keeps the first entry of
    # table(), i.e. the smallest label — same behaviour as before.
    y_pred[i] <- as.numeric(names(which.max(table(k_nearest_labels))))
  }

  y_pred
}

# Evaluate odd k from 1 to 29. Results are stored at index k itself,
# not densely at 1..15, so even positions stay NA. Downstream code
# (plot over 1:29, which.min) relies on this sparse layout: which.min()
# skips NA, so it returns the optimal k directly. Preallocate the
# vector instead of growing it implicitly through out-of-range writes.
error_rate <- rep(NA_real_, 29)

for (k in seq(1, 30, by = 2)) {
  y_pred <- knn_manual(normalized_train, normalized_test, y_train, k)
  error_rate[k] <- mean(y_pred != y_test)
}

# Plot error rate against k over 1..29; even indices are NA, so only
# the odd k values actually evaluated appear on the curve.
plot(1:29, error_rate, type = "b", col = "cornflowerblue", xlab = "Value of K", ylab = "Missclassification Rate", main = "Performance as a Function of K")

# which.min() ignores NA entries, and error_rate is indexed by k
# itself, so the returned index IS the optimal (odd) k — no back-
# conversion from position to k is needed.
optimal_k <- which.min(error_rate)

cat("\nThe Optimal K is ", optimal_k)

The Optimal K is  21

From the graph and from the optimal_k output we can see that the value of K that minimizes the misclassification rate is 21 in this case.

8.2

# Recompute the KNN predictions at the chosen k so the three models can
# be compared on held-out misclassification rate.
y_pred_knn <- knn_manual(normalized_train, normalized_test, y_train, optimal_k)

# Misclassification rates for each model. Use `<-` for assignment, as
# everywhere else in this script; variable names are kept unchanged so
# the print statements below still work.
missclassification_rate_knn <- mean(y_pred_knn != y_test)
missclassification_rate_logistic <- 1 - classification_accuracy
missclassification_rate_tree <- 1 - tree_result$overall["Accuracy"]

# Print the test-set misclassification rate of each classifier so they
# can be compared directly (lower is better).
cat("Misclassification Rate - KNN:", missclassification_rate_knn, "\n")
Misclassification Rate - KNN: 0.1052174 
cat("Misclassification Rate - Logistic:", missclassification_rate_logistic, "\n")
Misclassification Rate - Logistic: 0.09043478 
cat("Misclassification Rate - Decision Tree:", missclassification_rate_tree, "\n")
Misclassification Rate - Decision Tree: 0.09652174 

When comparing the misclassification rates of the three models on the test data, we can see that the logistic regression model performs best and the KNN model performs worst.